library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(data.table)
library(purrr)
library(stringr)
library(plm)
library(lmtest)
library(sandwich)

####################
# Data Processing ##
####################
# Load V-Dem unless a `vdem` object is already available in the session.
# `vdem` is required further down when `vdem.sel` is built, so running the
# script from a clean session fails if this read is skipped; the guard keeps
# an interactive session from re-reading the file on every run.
if (!exists("vdem")) {
  vdem <- read.csv("V-Dem-CY-Full+Others-v14.csv")
}
acled <- read.csv("1997-02-01-2025-02-01-Eastern_Africa-Middle_Africa-Western_Africa.csv")

# 1/0 indicators derived from the ACLED text columns.
acled$protest <- ifelse(acled$event_type == "Protests", 1, 0)
acled$civilian <- ifelse(acled$civilian_targeting == "Civilian targeting", 1, 0)
acled$state_forces <- ifelse(grepl("State forces", acled$interaction), 1, 0)

# Keep only events whose `interaction` field mentions state forces.
acled <- acled %>%
  filter(state_forces == 1)

# Country-year covariates taken from V-Dem. The two renames
# (`country_name` -> `country`, `COWcode` -> `ccode`) are done while
# selecting; `styr` is a plain copy of `year`.
vdem.sel <- dplyr::mutate(
  dplyr::select(
    vdem,
    country_id,
    country = country_name,
    year,
    ccode = COWcode,
    v2xel_locelec,
    v2x_polyarchy,
    e_gdppc,
    e_pop
  ),
  styr = year
)

# Attach the covariates to every event by country name and year. Events
# without a matching V-Dem row keep NA in the V-Dem columns.
# NOTE(review): the join key is the country *name*; confirm that ACLED and
# V-Dem spell every country identically.
acled <- acled %>%
  left_join(vdem.sel, by = c("country", "year"))

# Parse the event date ("%d %B %Y", e.g. day, full month name, year).
# `%B` is matched against the month names of the current LC_TIME locale, so
# under a non-English locale the parse can fail silently and return NA for
# every row. Warn when that happens instead of letting NAs flow downstream.
raw_event_date <- acled$event_date
acled$event_date <- as.Date(raw_event_date, format = "%d %B %Y")

n_unparsed <- sum(
  is.na(acled$event_date) & !is.na(raw_event_date) & raw_event_date != ""
)
if (n_unparsed > 0) {
  warning(
    n_unparsed, " event_date value(s) could not be parsed with ",
    "format \"%d %B %Y\"; check the LC_TIME locale.",
    call. = FALSE
  )
}
rm(raw_event_date, n_unparsed)

# Sort events chronologically within each country. One sort is enough; the
# data stays a plain data frame because every later step uses dplyr.
acled <- acled %>%
  arrange(country, event_date)

# Linearly time-weighted mean of `fatalities` over all earlier rows of the
# same country (rows are ordered by event_date within country).
#
# For row i, each earlier row gets weight 1 - gap / max(gap), where gap is
# the number of days between that row and row i. The most distant earlier
# row therefore always has weight 0. The first row of a country has no
# history and gets NA.
#
# Degenerate case: when every usable earlier row is equally distant (always
# true for the second row, and whenever all earlier events share one date),
# the linear weights are all 0 (or 0/0) and weighted.mean() would return
# NaN. There is then no basis for weighting rows differently, so the plain
# mean of the earlier fatalities is used instead.
acled <- acled %>%
  arrange(country, event_date) %>%
  group_by(country) %>%
  mutate(cumulated_fatal_weighted = map_dbl(row_number(), function(i) {
    if (i == 1) {
      return(NA_real_)
    }

    prior <- seq_len(i - 1)
    past_fatalities <- fatalities[prior]
    days_diff <- as.numeric(
      difftime(event_date[i], event_date[prior], units = "days")
    )
    weights <- 1 - (days_diff / max(days_diff))  # linear weights

    # Only rows with a non-missing fatality count contribute, so test the
    # weight mass on those rows. Missing dates are left to propagate as NA.
    usable <- !is.na(past_fatalities)
    if (!anyNA(days_diff) && !isTRUE(sum(weights[usable]) > 0)) {
      weights <- rep(1, length(days_diff))
    }

    weighted.mean(past_fatalities, w = weights, na.rm = TRUE)
  })) %>%
  ungroup()

# Decay rate for the exponential weights: a weight halves every `half_life`
# days (the rate multiplies a gap measured in days below).
half_life <- 730
lambda <- log(2) / half_life

# Exponentially time-weighted mean of `fatalities` over all earlier rows of
# the same country. An earlier row that lies `gap_days` before the current
# row gets weight exp(-lambda * gap_days); the first row of each country
# has no history and is NA.
acled <- acled %>%
  arrange(country, event_date) %>%
  group_by(country) %>%
  mutate(
    cumulated_fatal_weighted_exp = vapply(
      seq_len(n()),
      function(pos) {
        if (pos <= 1) {
          return(NA_real_)
        }
        earlier <- 1:(pos - 1)
        gap_days <- as.numeric(
          difftime(event_date[pos], event_date[earlier], units = "days")
        )
        weighted.mean(
          fatalities[earlier],
          w = exp(-lambda * gap_days),
          na.rm = TRUE
        )
      },
      numeric(1)
    )
  ) %>%
  ungroup()

# Derive a numeric crowd-size estimate (`participants`) from the free-text
# `tags` column.
#
# Rule order matters because case_when() takes the first matching rule:
#   1. explicit ranges ("30-50", "between 100 and 200", ...) -> midpoint;
#      these come first so that a generic keyword contained in the same
#      text (e.g. "dozens" in "between dozens and several hundred") does
#      not pre-empt them;
#   2. verbal magnitudes ("hundreds", "dozens", ...) -> fixed value;
#   3. explicit "not reported" wording -> NA;
#   4. otherwise the first number found in the text.
acled <- acled %>%
  mutate(
    # First number in the text. A comma-grouped number must contain at least
    # one ",ddd" group; otherwise the whole digit run is taken. (With an
    # optional group, "1500" would be cut to "150" because the first
    # alternative already succeeds on three digits.)
    participants_raw = str_extract(tags, "\\d{1,3}(?:,\\d{3})+|\\d+"),

    participants = case_when(
      # Explicit ranges -> midpoint.
      str_detect(tags, "30-50") ~ "40",
      str_detect(tags, "40 to 50") ~ "45",
      str_detect(tags, "20 to 30") ~ "25",
      str_detect(tags, regex("between.*30.*40", ignore_case = TRUE)) ~ "35",
      str_detect(tags, regex("between.*100.*200", ignore_case = TRUE)) ~ "150",
      str_detect(tags, regex("between.*300.*500", ignore_case = TRUE)) ~ "400",
      str_detect(tags, regex("between dozens and several hundred", ignore_case = TRUE)) ~ "200",

      # Verbal magnitudes, most specific phrase first.
      str_detect(tags, regex("hundreds of thousands", ignore_case = TRUE)) ~ "500000",
      str_detect(tags, regex("tens of thousands", ignore_case = TRUE)) ~ "50000",
      str_detect(tags, regex("thousands", ignore_case = TRUE)) ~ "5000",
      str_detect(tags, regex("hundreds", ignore_case = TRUE)) ~ "500",
      str_detect(tags, regex("dozens", ignore_case = TRUE)) ~ "50",
      str_detect(tags, regex("scores", ignore_case = TRUE)) ~ "40",
      # Word boundaries keep "tens" from matching inside other words
      # (e.g. "intense", "extensive").
      str_detect(tags, regex("\\btens\\b", ignore_case = TRUE)) ~ "20",
      str_detect(tags, regex("a handful|few people|several", ignore_case = TRUE)) ~ "5",
      str_detect(tags, regex("massive|immense|huge|large numbers|large crowd|big numbers", ignore_case = TRUE)) ~ "10000",

      # Explicitly unreported sizes stay missing.
      str_detect(tags, regex("no report|not reported|unknown|unreported|no response|no record", ignore_case = TRUE)) ~ NA_character_,

      # Fall back to the first number found (NA when there is none).
      TRUE ~ participants_raw
    ),
    # Drop thousands separators before converting to numeric.
    participants = str_remove_all(participants, ","),
    participants = as.numeric(participants)
  )

# Collapse the event-level data to one row per country and event date.
# Every retained variable is the NA-removing mean of its event-level values
# on that day; `participants`, `fatalities`, `e_gdppc` and `e_pop` are
# renamed in the process. A country-day whose values are all missing yields
# NaN for that variable.
acled_meaned <- acled %>%
  group_by(country, event_date) %>%
  dplyr::summarise(
    participants_mean = mean(participants, na.rm = TRUE),
    fatalities_mean = mean(fatalities, na.rm = TRUE),
    across(
      c(protest, civilian, v2x_polyarchy),
      ~ mean(.x, na.rm = TRUE)
    ),
    gdppc = mean(e_gdppc, na.rm = TRUE),
    pop = mean(e_pop, na.rm = TRUE),
    across(
      c(
        state_forces,
        cumulated_fatal_weighted,
        cumulated_fatal_weighted_exp,  # exponential-decay variant
        v2xel_locelec,
        year
      ),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  # Move the two grouping keys from the front to just after `civilian`.
  dplyr::relocate(country, event_date, .after = civilian)

# Features based on the linearly weighted fatality history, computed within
# each country in date order:
#   lag_cumulated_fatal_weighted  - previous country-day's weighted history
#   fatality_dummy_weighted       - 1 if today's mean fatalities exceed it
#   fatality_diff_weighted        - today's mean fatalities minus it
#   participants_lead             - next country-day's mean participants
#   fatality_diff_signed_weighted - signed log1p of the difference
acled_meaned <- acled_meaned %>%
  arrange(country, event_date) %>%
  group_by(country) %>%
  mutate(
    lag_cumulated_fatal_weighted = dplyr::lag(cumulated_fatal_weighted),
    fatality_dummy_weighted = as.numeric(
      fatalities_mean > lag_cumulated_fatal_weighted
    ),
    fatality_diff_weighted = fatalities_mean - lag_cumulated_fatal_weighted,
    participants_lead = dplyr::lead(participants_mean),
    # Row-wise transform, so the grouping does not affect it.
    fatality_diff_signed_weighted =
      sign(fatality_diff_weighted) * log1p(abs(fatality_diff_weighted))
  ) %>%
  ungroup()

# Same features for the exponentially weighted fatality history.
# Step 1 (per country, date order): lag the weighted history by one
# country-day.
acled_meaned <- acled_meaned %>%
  arrange(country, event_date) %>%
  group_by(country) %>%
  mutate(
    lag_cumulated_fatal_weighted_exp = dplyr::lag(cumulated_fatal_weighted_exp)
  ) %>%
  ungroup()

# Step 2 (row-wise): difference to today's mean fatalities and its signed
# log1p transform.
acled_meaned <- acled_meaned %>%
  mutate(
    fatality_diff_weighted_exp =
      fatalities_mean - lag_cumulated_fatal_weighted_exp,
    fatality_diff_signed_weighted_exp =
      sign(fatality_diff_weighted_exp) *
        log1p(abs(fatality_diff_weighted_exp))
  )

# log1p transforms: `fatalities_log` is a new column; `participants_lead`,
# `gdppc` and `pop` are overwritten in place with their log1p values.
acled_meaned <- acled_meaned %>%
  mutate(
    fatalities_log = log1p(fatalities_mean),
    across(c(participants_lead, gdppc, pop), log1p)
  )

# Declare the panel structure (unit = country, time = event_date) and write
# the result to disk without row names.
pdata <- acled_meaned %>%
  pdata.frame(index = c("country", "event_date"))

write.csv(pdata, file = "acled_country_final.csv", row.names = FALSE)
